# ------------------------------------------------------------------------------
# Clean demographics table for analysis
# Author: Cassidy Shubatt <cshubatt@gmail.com>
# To run: bsub -q big -R "rusage[mem=10000]" bash 02_clean_demographics.sh
# ------------------------------------------------------------------------------

# Setup ------------------------------------------------------------------------
library(yaml) # read_yaml absolute filepaths
library(data.table)
library(here) # here() relative filepaths
library(testit) # assert function
library(tidyverse)
library(glue)
library(RSQLite) # connect to xwalk database
library(feather) # read_feather
library(reticulate) # source_python
library(fastDummies) # dummy_cols()

u <- modules::use(here("lib", "util.R"))
source_python(here::here("lib", "python_util.py")) # decode_census_data()

# Load Data --------------------------------------------------------------------
message("Loading data...")
paths <- read_yaml(here("lib", "filepaths.yml"))

mydb <- dbConnect(RSQLite::SQLite(), paths$features$db_xwalked)
res <- dbSendQuery(mydb, "SELECT * FROM dem_xwalked")
demos <- dbFetch(res) %>%
  unique()
dbClearResult(res)
dbDisconnect(mydb)

# Income Data ------------------------------------------------------------------
message("Reading census df...")
# have to decode bytes in python, not sure how to load in R otherwise
census_income <- decode_census_data(paths$raw_cohort$census_income) %>%
  mutate(median_income = as.numeric(B19013_001)) %>%
  select(empi, median_income) %>%
  unique() %>%
  group_by(empi) %>%
  summarize(median_income = median_income[1]) %>%
  ungroup()
message("Reading ptid xwalk...")
income_xwalk <- read_feather(paths$raw_cohort$ptid_xwalk) %>%
  select(-bwh_mrn) %>%
  unique() %>%
  u$safe_left_join(census_income) %>%
  select(ptid, median_income) %>%
  group_by(ptid) %>%
  summarize(median_income = median_income[1]) %>%
  ungroup()

# Clean ------------------------------------------------------------------------
message("Cleaning demographics...")
# names(demos) <- str_replace(names(demos), "dem_", "")
income_levels <- c(
  "under_25K", "25K_to_50K", "50K_to_75K", "75K_to_100K", "100K_to_200K",
  "above_200K"
)

analysis_dems <- demos %>%
  mutate(race = case_when(
    race_black == 1 ~ "black",
    race_hispanic == 1 ~ "hispanic",
    race_other == 1 ~ "other",
    TRUE ~ "white"
  )) %>%
  mutate(
    race = factor(race, levels = c("white", "black", "hispanic", "other"))
  ) %>%
  setnames("age", "age_at_admit") %>%
  mutate(
    age = case_when(
      age_at_admit < 50 ~ "49_or_under",
      age_at_admit < 55 ~ "50_to_54",
      age_at_admit < 60 ~ "55_to_59",
      age_at_admit < 65 ~ "60_to_64",
      age_at_admit < 70 ~ "65_to_69",
      age_at_admit < 75 ~ "70_to_74",
      age_at_admit < 80 ~ "75_to_79",
      age_at_admit < 85 ~ "80_to_84",
      age_at_admit < 90 ~ "85_to_89",
      TRUE ~ "90_or_over"
    )
  ) %>%
  mutate(days_to_death = death_date - t0) %>%
  mutate(death_030_day = replace_na(days_to_death <= 30, FALSE)) %>%
  mutate(death_060_day = replace_na(days_to_death <= 60, FALSE)) %>%
  mutate(death_180_day = replace_na(days_to_death <= 180, FALSE)) %>%
  mutate(death_365_day = replace_na(days_to_death <= 365, FALSE)) %>%
  select(-t0, -dummydate, -birth_date, -chunk1K, -chunk10K, -age_days) %>%
  u$safe_left_join(income_xwalk) %>%
  #
  # income_df <- analysis_dems %>%
  mutate(
    income = case_when(
      median_income < 25000 ~ "under_25K",
      median_income < 50000 ~ "25K_to_50K",
      median_income < 75000 ~ "50K_to_75K",
      median_income < 100000 ~ "75K_to_100K",
      median_income < 200000 ~ "100K_to_200K",
      TRUE ~ "above_200K"
    )
  ) %>%
  mutate(income = ifelse(is.na(median_income), NA, income)) %>%
  mutate(income = factor(income, levels = income_levels)) %>%
  dummy_cols(select_columns = c("race", "income"))

write_rds(analysis_dems, paths$analysis$demographics)

message("Done.")
